# if(!require(pacman)) install.packages('pacman') pacman::p_load(dplyr,
# insight, lubridate, readr, downloader, readxl, RCurl, writexl, tidyr,
# stringr, tibble, htmlTable, devtools, roxygen2, plotly, ggnewscale, ggplot2,
# fasstr, lme4, sjmisc, mgcv, gridExtra, ggfortify, visreg, formatR, sf,
# ggrepel, reshape, grid, glmmTMB, remotes, merTools, GGally, plyr, imputeTS,
# ggpubr, geojsonio, mapview)
# Install pacman if it is missing, then attach it. Checking with
# requireNamespace() and attaching with library() means pacman ends up attached
# whether or not it had to be installed first, and a failed install stops the
# script here instead of surfacing later.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
library(pacman)

# Install (where needed) and attach the CRAN packages used below.
pacman::p_load(
  insight, RCurl, writexl, htmlTable, devtools, roxygen2, ggnewscale,
  visreg, formatR, reshape, glmmTMB, remotes, merTools, GGally, imputeTS,
  sf, geojsonio, mapview, plyr, ggfortify
)

# Install the rict package from GitHub.
remotes::install_github("aquaMetrics/rict")

# For development: install.packages('devtools'); library(devtools);
# devtools::load_all(export_all = FALSE). Use install_deps() for the public
# version (once live).

# Install hetoolkit from GitHub and attach it.
remotes::install_github("APEM-LTD/hetoolkit")
library(hetoolkit)
The hetoolkit package comprises a collection of 21
functions for assembling, processing, visualising and modelling
hydro-ecological data. These are:

- import_nrfa for importing flow data from the National River Flow Archive (NRFA);
- import_hde for importing flow data from the Environment Agency (EA) Hydrology Data Explorer (HDE);
- import_flowfiles for importing flow data from local files;
- import_flow for importing flow data from a mix of the above sources;
- impute_flow for infilling missing records in daily flow time series for one or more sites (gauging stations) using either an interpolation or an equipercentile method;
- import_inv for importing macroinvertebrate sampling data from the EA Ecology and Fish Data Explorer;
- import_env for importing environmental base data from the EA Ecology and Fish Data Explorer;
- import_rhs for importing River Habitat Survey (RHS) data from the EA’s Open Data portal;
- predict_indices for calculating expected scores for macroinvertebrate indices using the RICT model (FBA 2020);
- calc_flowstats and calc_rfrstats for calculating summary statistics describing historical flow conditions;
- join_he for joining the above datasets;
- plot_heatmap for visualising and summarising gaps in time series data;
- plot_hev and shiny_hev for producing time series plots of biology and flow data;
- plot_sitepca for summarising environmental characteristics of biological sampling sites;
- plot_rngflows for visualising the range of flow conditions experienced historically at a site;
- model_cv and model_logocv for performing cross-validation on linear mixed-effects models and hierarchical generalized additive models;
- diag_lmer for generating a variety of diagnostic plots for a mixed-effects regression (lmer) model; and
- plot_predictions for visualising the time series predictions from a hydro-ecological model.

This vignette illustrates a typical workflow using a selection of 20 macroinvertebrate sampling sites from the Environment Agency’s National Drought Monitoring Network (NDMN).
Although the package has been developed with macroinvertebrate data in mind, the functions can be used with any kind of biological sampling data.
Linking together disparate datasets requires a look-up table of site ids. In this example, we load a table with four columns: biol_site_id, flow_site_id, flow_input and rhs_survey_id.
# Load the master_file dataset (the look-up table of site ids).
data("master_file")

# Store the biology site ids and RHS survey ids as character vectors.
master_file$biol_site_id <- as.character(master_file$biol_site_id)
master_file$rhs_survey_id <- as.character(master_file$rhs_survey_id)

# Keep only the 20 biology sites of interest. filter() is namespaced, like the
# other dplyr verbs in this file, so it cannot resolve to stats::filter() when
# dplyr is not attached or is masked by another package.
master_data <- master_file %>%
  dplyr::filter(biol_site_id %in% c(
    "34310", "34343", "34352", "55287", "55395", "55417",
    "55673", "55824", "55897", "56065", "56226", "54637", "54769", "54801",
    "54962", "80998", "56491", "54827", "77216", "52828"
  ))

# View the filtered look-up table.
master_data

# Extract one vector per id column, for use as function arguments later on.
biolsites <- master_data$biol_site_id
flowsites <- master_data$flow_site_id
flowinputs <- master_data$flow_input
rhssurveys <- master_data$rhs_survey_id
A number of standardised column names are used throughout the
hetoolkit package, and throughout this vignette and its
associated datasets. These include:
import_flow function

The import_inv function imports macroinvertebrate
sampling data from the Environment Agency’s Ecology and Fish Data
Explorer. The data can either be downloaded from https://environment.data.gov.uk/ecology-fish/downloads/INV_OPEN_DATA.zip
or read in from a local .csv or .rds file. The data can be optionally
filtered by site ID and sample date.
Below, we use our list biolsites to filter the data from
EDE.
# Import biology data from EDE for the sites listed in biolsites, passing
# 2010-01-01 as the start date and 2020-12-31 as the end date.
biol_data <- import_inv(
  source = "parquet",
  sites = biolsites,
  start_date = "2010-01-01",
  end_date = "2020-12-31"
)

# Print the imported biology data.
biol_data
If the user has additional biology data in a separate file (for example, an Excel sheet saved as .csv, as in the code below), it is possible to append this to the EDE download. The additional data must have the same column names as the EDE download file.
# Combine two biology datasets: the EDE download and a local csv file.

# Columns of the EDE download that are not wanted in the combined dataset.
drops_bio <- c(
  "SAMPLE_VERSION", "REPLICATE_CODE", "SAMPLE_TYPE", "SAMPLE_METHOD",
  "ANALYSIS_TYPE", "ANALYSIS_METHOD", "IS_THIRD_PARTY_DATA", "WATERBODY_TYPE"
)

# Keep every column of biol_data whose name is not listed in drops_bio.
biol_data2 <- biol_data[, !names(biol_data) %in% drops_bio]

# Read the additional biology data, store the site id as character and
# convert the result to a tibble.
biol_data_excel <- read.csv("data/biol_data_join.csv") %>%
  dplyr::mutate(biol_site_id = as.character(biol_site_id)) %>%
  as_tibble()

# Stack the two datasets row-wise.
biol_data_final <- rbind(biol_data2, biol_data_excel)
The import_env function allows the user to download
environmental base data from the Environment Agency’s Ecology and Fish
Data Explorer.
The function either:
Data can be optionally filtered by site ID.
When saving, the name of the rds file is hard-wired to: INV_OPEN_DATA_SITES_ALL.rds.
If saving prior to filtering, the name of the filtered rds file is hard-wired to: INV_OPEN_DATA_SITE_F.rds.
Below, we use our list biolsites to filter the data from
EDE.
# Import environmental base data from EDE, filtered to the sites in biolsites
env_data <- import_env(sites = biolsites)
# view env_data
env_data
First we download data for our basemap. The England map is of EA public facing area boundaries.
We use the environmental base data that we have downloaded from the
Ecology and Fish Data Explorer using import_env, which gives
us the National Grid References (NGRs) of the sampling sites. We translate
the NGRs to full latitude / longitude (WGS84) and match these back to
env_data so that we have information to include in the plot.
# Read the EA public facing area boundaries (GeoJSON) for the basemap.
url_request <- "https://environment.data.gov.uk/arcgis/rest/services/EA/AdminBoundEAandNEpublicFaceAreas/FeatureServer/0/query?where=seaward='No'&outFields=*&f=geojson"
ea.areas <- sf::st_read(url_request)
# Output recorded when the vignette was built:
## Reading layer `OGRGeoJSON' from data source
## `https://environment.data.gov.uk/arcgis/rest/services/EA/AdminBoundEAandNEpublicFaceAreas/FeatureServer/0/query?where=seaward='No'&outFields=*&f=geojson'
## using driver `GeoJSON'
## Simple feature collection with 14 features and 10 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -6.419008 ymin: 49.86463 xmax: 1.768937 ymax: 55.81166
## Geodetic CRS: WGS 84

# Convert the national grid references (NGR) in env_data (from import_env)
# to WGS84 coordinates and hold them as a tibble.
# NOTE(review): osg_parse is called without a namespace prefix and its package
# is not in the p_load list visible here; confirm it is attached.
temp.eastnorths <- as_tibble(
  osg_parse(env_data$NGR_10_FIG, coord_system = "WGS84")
)

# Attach the coordinates to env_data and keep only the columns shown on the map.
env_data_map <- dplyr::select(
  cbind(env_data, temp.eastnorths),
  AGENCY_AREA, WATER_BODY, CATCHMENT, WATERBODY_TYPE, biol_site_id, lat, lon
)
Finally we use mapview to plot the EA areas and points indicating the sample sites. The points and polygons are labelled with the biology sample site ID and the EA area code respectively. More data for each site is available by clicking on the point.
# Build the map: EA area polygons labelled with their area code, overlaid with
# the biology sampling sites labelled with their site id.
# NOTE(review): no crs is passed for the data.frame layer; confirm the lon/lat
# points line up with the basemap as intended.
mapview(
  ea.areas,
  alpha.regions = 0.2,
  label = ea.areas$code
) +
  mapview(
    env_data_map,
    xcol = "lon",
    ycol = "lat",
    label = env_data_map$biol_site_id,
    grid = FALSE
  )